Load input files
EIC_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo/EICv2/analytics_output.txt")
ECS_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo/ECS/analytics_output.txt")
CS2_processed_contigs <- read.delim("/home/users/allstaff/tonkin-hill.g/assembly_testing/ComparisonForManuscript/SoapDeNovo/CS2/analytics_output.txt")
## Loading required package: RUnit
library(ggplot2)
library(reshape)
plot_contigs <- function(blastData, length_filt, identity_filt, var_gene_lengths){
# blastData <-EIC_processed_contigs
#first filter out unmatched contigs and convert types
blastData <- data.frame(subset(blastData, sequence.id != '-'))
blastData$alignment.length <-as.numeric(as.character(blastData$alignment.length))
blastData$s..start <-as.numeric(as.character(blastData$s..start))
blastData$s..end <-as.numeric(as.character(blastData$s..end))
blastData$q..start <-as.numeric(as.character(blastData$q..start))
blastData$q..end <-as.numeric(as.character(blastData$q..end))
blastData$percent.identity <-as.numeric(as.character(blastData$percent.identity))
#now filter out hits we aren't interested in
#blastData <- subset(blastData, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
#blastData <- subset(blastData, alignment.length > 800) #throw away contigs less than 3 reads ~ length residues long
#blastData <- subset(blastData, percent.identity > identity_filt) #throw away alignments that share less than x%
#now split into seperate data frames for each database
var_hits <- subset(blastData, grepl("blastHits_Full_var_rask2010*",data.base.name))
var_hits <- subset(var_hits, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
var_hits <- subset(var_hits, percent.identity > identity_filt) #throw away alignments that share less than x%
#var_hits <-subset(var_hits, (sequence.id == 'IT4var04') | (sequence.id == 'IT4var08'))
human_hits <- subset(blastData, grepl("blastHits_human_reference*",data.base.name))
Pf3D7_hits <- subset(blastData, grepl("blastHits_Pf3D7_reference*",data.base.name))
Pf3D7_hits <- subset(Pf3D7_hits, alignment.length > 100) #throw away contigs less than 3 reads ~ length residues long
Pf3D7_hits <- subset(Pf3D7_hits, percent.identity > 85) #throw away alignments that share less than x%
#filter out human and Pf3D7 contigs that aren't matched to any var genes
var_contigs <- unique(var_hits$contig.id)
human_hits <- subset(human_hits, contig.id %in% var_contigs)
Pf3D7_hits <- subset(Pf3D7_hits, contig.id %in% var_contigs)
#now annoying stuff so ggplot doesnt get upset (give each contig a different height?)
ydf <- data.frame(contig.id=unique(var_hits$contig.id) , y=as.numeric(factor(unique(var_hits$contig.id))))
var_hits <- merge(var_hits, ydf, by="contig.id")
human_hits <- merge(human_hits, ydf, by="contig.id")
Pf3D7_hits <- merge(Pf3D7_hits, ydf, by="contig.id")
#Now generate the underlying contigs (full length)
contig <- var_hits
contig$s..start <- contig$s..start - contig$q..start
contig$s..end <- contig$s..start+contig$contig.length
#now change the Pf3D7 sequence to be in terms of the var reference VERY HACKY
diff <- data.frame(contig.id=contig$contig.id, diff=contig$s..start, diff_seq=contig$sequence.id)
diff <- subset(diff, !duplicated(contig.id))
diff <- merge(diff, Pf3D7_hits, by="contig.id")
Pf3D7_hits$s..start <- diff$diff + Pf3D7_hits$q..start
Pf3D7_hits$s..end <- diff$diff + Pf3D7_hits$q..end
Pf3D7_hits$sequence.id <- diff$diff_seq
#get the gene lengths from rask data
var_gene_lengths <- var_gene_lengths[var_gene_lengths$sequence.id %in% intersect(var_gene_lengths$sequence.id, contig$sequence.id),]
gg <- ggplot(data=contig, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1, fill=factor(contig.id))) + geom_rect( alpha=0.4)
gg <- gg + geom_rect(data=var_hits, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1, fill=factor(contig.id), labels = rpk))
#gg <- gg + geom_rect(data=Pf3D7_hits, aes(xmin = s..start, xmax = s..end, ymin = y, ymax = y+1), color="black", alpha=0.5)
gg <- gg + geom_text(data=var_hits, aes(x=s..start+(s..end-s..start)/2, y=y+(y+1-y)/2, label=paste(contig.id,rpk, sep=" ")), size=8)
gg <- gg + geom_vline(data = var_gene_lengths, aes(xintercept = gene.length))
gg <- gg + facet_wrap(~sequence.id)
gg <- gg + xlab("location on reference sequence")
gg <- gg + ylab("") + scale_y_continuous(breaks=NULL)
gg <- gg + mytheme()
print(gg)
#paste(percent.identity,mismatches,gap.opens, sep=" ")
rpk_counts <- aggregate(. ~ sequence.id, data=var_hits, sum)
rpk_counts <- rpk_counts[,(names(rpk_counts) %in% c('sequence.id','rpk'))]
names(rpk_counts) <- c("var.name", "contig.count")
return(rpk_counts)
}
Function to calculate redundancy
library(data.table)
library(IRanges)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, as.vector, cbind, colnames, do.call,
## duplicated, eval, evalq, get, intersect, is.unsorted, lapply,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, rank, rbind, rep.int, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unlist, unsplit
##
## Loading required package: S4Vectors
## Loading required package: stats4
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:reshape':
##
## rename
##
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:reshape':
##
## expand
library(GenomicRanges)
## Loading required package: GenomeInfoDb
redundancy <- function(blastData, length_filt, var_gene_lengths, identity_filt=97){
# blastData <-CS2_processed_contigs
#first filter out unmatched contigs and convert types
blastData <- data.frame(subset(blastData, sequence.id != '-'))
blastData$alignment.length <- as.numeric(as.character(blastData$alignment.length))
blastData$s..start <- as.numeric(as.character(blastData$s..start))
blastData$s..end <- as.numeric(as.character(blastData$s..end))
blastData$q..start <- as.numeric(as.character(blastData$q..start))
blastData$q..end <- as.numeric(as.character(blastData$q..end))
blastData$percent.identity <- as.numeric(as.character(blastData$percent.identity))
blastData$alignment.length <- as.numeric(as.character(blastData$alignment.length))
blastData$bit.score <- as.numeric(as.character(blastData$bit.score))
#now split into seperate data frames for each database
var_hits <- subset(blastData, grepl("blastHits_Full_var_rask2010*",data.base.name))
var_hits <- subset(var_hits, alignment.length > length_filt) #throw away contigs less than 3 reads ~ length residues long
var_hits <- subset(var_hits, percent.identity > identity_filt) #throw away alignments that share less than x%
#var_hits <-subset(var_hits, (sequence.id == 'IT4var04') | (sequence.id == 'IT4var08'))
human_hits <- subset(blastData, grepl("blastHits_human_reference*",data.base.name))
Pf3D7_hits <- subset(blastData, grepl("blastHits_Pf3D7_reference*",data.base.name))
Pf3D7_hits <- subset(Pf3D7_hits, alignment.length > 100) #throw away contigs less than 3 reads ~ length residues long
Pf3D7_hits <- subset(Pf3D7_hits, percent.identity > 85) #throw away alignments that share less than x%
#filter out human and Pf3D7 contigs that aren't matched to any var genes
var_contigs <- unique(var_hits$contig.id)
human_hits <- subset(human_hits, contig.id %in% var_contigs)
Pf3D7_hits <- subset(Pf3D7_hits, contig.id %in% var_contigs)
#sort var hit by name then bit score
var_hits <- var_hits[with(var_hits, order(contig.id, bit.score, decreasing=TRUE)), ]
var_hits <- droplevels(var_hits)
#remove duplicated contigs which have overlapping query regions.
VH <- as.data.table(var_hits)
VH$q..start <- VH$q..start+250
VH$q..end <- VH$q..end-250
VH[,group := {
ir <- IRanges(q..start, q..end);
subjectHits(findOverlaps(ir, reduce(ir), minoverlap=0))
},by=contig.id]
VH$q..start <- VH$q..start-250
VH$q..end <- VH$q..end+250
VH <- data.frame(VH)
VH <- VH[!duplicated(VH[,colnames(VH) %in% c('contig.id', 'group')]),]
VH$group <- NULL
total.alignment <- sum(VH$alignment.length)
gr = GRanges(VH$sequence.id,
IRanges(VH$s..start, VH$s..end),
strand="*")
re <- reduce(gr)
total.coverage <- sum(width(ranges(re)))
redundancy <- total.alignment/total.coverage
return(redundancy)
}
Load more data
#expression data
var_expressions_mike <- read.csv("/wehisan/home/allstaff/t/tonkin-hill.g/assembly_testing/counts/var_expressions_mike.csv")
#read count data
read_counts_against_IT4_genes <- read.csv("/wehisan/home/allstaff/t/tonkin-hill.g/assembly_testing/counts/read_counts_against_IT4_genes.csv")
CS2_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$CS2, 'counts'=read_counts_against_IT4_genes$counts)
ECS_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$ECS, 'counts'=read_counts_against_IT4_genes$counts.1)
EIC_read_counts <- data.frame('var name'=read_counts_against_IT4_genes$EIC, 'counts'=read_counts_against_IT4_genes$counts.2)
#gene length data
var_gene_lengths <- read.delim("/wehisan/home/allstaff/t/tonkin-hill.g/find_var_genes/assemble_var/data/var_rask_lengths.txt", header=F)
colnames(var_gene_lengths) <- c("sequence.id", "gene.length")
var_gene_lengths <- data.frame(var_gene_lengths)
For the contig plots we restrict the contigs to those that are longer than 1000 residues and share more than 95% similarity with the reference. Any contig that has overlapped with human, falciparum or vivax (non-var) by more than 30% has also been removed. In each of the contig plots below the lighter shaded blocks represent the contigs whilst the dark colouring represents those parts of the contigs that have aligned to the reference. If contigs map to more than one reference gene they can appear more than once.
Lets look at a plot of the contigs for ECS.
contig_counts <- plot_contigs(ECS_processed_contigs, 800, 95, var_gene_lengths)
Next we look at the redundancy of this assembly. Redundancy is calculated by first filtering contigs to those that align to VAR with a length of at least 800 and identity of 97. We then only allow each region of each transcript to only map to one reference region. This is done rather harshly in that if two blast hit overlap from on the same region of the transcript we take the one with the high bit score. Reduncdancy is then calculated as \[ \frac{\text{total alignment length}}{\text{total number of nucleotide in reference that are covered by hits}} \]
redundancy(ECS_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1.000081
For the expression plots I first aligned all the reads for each sample onto all the IT4var genes. This is reffered to as ‘whole gene count’. I then aligned all the reads for each sample onto the contigs output by our assembly pipeline. This is reffered to as ‘contig count’. Finally expression count are the 2exp-deltadeltaCt numbers from the quantification assay. All these counts are normalised and then plotted on a log10 scale. Thus each point on the y-axis can be interpreted as 0.1% of the total for that count type and that sample.
plot expression for ECS
ecs_df <- merge(var_expressions_mike, ECS_read_counts, by='var.name')
ecs_df <- ecs_df[,!(names(ecs_df) %in% c('CS2','EIC'))]
ecs_df$counts <- ecs_df$counts/sum(ecs_df$counts)
ecs_df$ECS <- ecs_df$ECS/sum(ecs_df$ECS)
ecs_df <- merge(ecs_df, contig_counts, all.x=TRUE, by='var.name')
ecs_df$contig.count[is.na(ecs_df$contig.count)] <- 0
ecs_df$contig.count <- ecs_df$contig.count/sum(ecs_df$contig.count)
names(ecs_df) <- c('Rask_name','Expression_name','expression count','whole gene count','contig count')
ecs_df2 <- melt(ecs_df, id=c('Rask_name','Expression_name'))
#ecs_df2$value[1000*ecs_df2$value < 1] <- 1/1000
ecs <- ggplot(ecs_df2, aes(Rask_name, fill=factor(variable), weight=100*value)) + geom_bar(position="dodge")
ecs <- ecs + mybartheme() + scale_y_continuous(trans = 'asinh', breaks=c(1,10,100))
ecs <- ecs + ylab("Proportion of total expression\n%") + xlab("Gene Name")
ecs
Lets look at a plot of the contigs for CS2
contig_counts <- plot_contigs(CS2_processed_contigs, 800, 95, var_gene_lengths)
Redundancy
redundancy(CS2_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1.000568
plot expression for CS2
cs2_df <- merge(var_expressions_mike, CS2_read_counts, by='var.name')
cs2_df <- cs2_df[,!(names(cs2_df) %in% c('ECS','EIC'))]
cs2_df$counts <- cs2_df$counts/sum(cs2_df$counts)
cs2_df$CS2 <- cs2_df$CS2/sum(cs2_df$CS2)
cs2_df <- merge(cs2_df, contig_counts, all.x=TRUE, by='var.name')
cs2_df$contig.count[is.na(cs2_df$contig.count)] <- 0
cs2_df$contig.count <- cs2_df$contig.count/sum(cs2_df$contig.count)
names(cs2_df) <- c('Rask_name','Expression_name','expression count','whole gene count','contig count')
cs2_df2 <- melt(cs2_df, id=c('Rask_name','Expression_name'))
#cs2_df2$value[1000*cs2_df2$value < 1] <- 1/1000
cs2 <- ggplot(cs2_df2, aes(Rask_name, fill=factor(variable), weight=100*value)) + geom_bar(position="dodge")
cs2 <- cs2 + scale_y_continuous(limits=c(0,NA))
cs2 <- cs2 + mybartheme() + scale_y_continuous(trans = 'asinh', breaks=c(1,10,100))
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
cs2 <- cs2 + ylab("Proportion of total expression\n%") + xlab("Gene Name")
cs2
Lets look at a plot of the contigs for EIC As their was a higher rate of similarities in EIC I’ve upped the similarity threshold to 97%
contig_counts <- plot_contigs(EIC_processed_contigs, 800, 97, var_gene_lengths)
Redundancy
redundancy(EIC_processed_contigs, 800, var_gene_lengths, 95)
## [1] 1.032246
plot expression for EIC
eic_df <- merge(var_expressions_mike, EIC_read_counts, by='var.name')
eic_df <- eic_df[,!(names(eic_df) %in% c('ECS','CS2'))]
eic_df$counts <- eic_df$counts/sum(eic_df$counts)
eic_df$EIC <- eic_df$EIC/sum(eic_df$EIC)
eic_df <- merge(eic_df, contig_counts, all.x=TRUE, by='var.name')
eic_df$contig.count[is.na(eic_df$contig.count)] <- 0
eic_df$contig.count <- eic_df$contig.count/sum(eic_df$contig.count)
names(eic_df) <- c('Rask_name','Expression_name','expression count','whole gene count','contig count')
eic_df2 <- melt(eic_df, id=c('Rask_name','Expression_name'))
# eic_df2$value[1000*eic_df2$value < 1] <- 1/1000
eic <- ggplot(eic_df2, aes(Rask_name, fill=factor(variable), weight=100*value)) + geom_bar(position="dodge")
eic <- eic + scale_y_continuous(limits=c(0,NA))
eic <- eic + mybartheme() + scale_y_continuous(trans = 'asinh', breaks=c(1,10,100))
## Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
eic <- eic + ylab("Proportion of total expression\n%") + xlab("Gene Name")
eic
Now we calculate some summary statistics for this comparison. First the number of IT4var genes found and the total found by qPCR above 1e-3.
countQPCRExpressed <- sum(eic_df$'expression count'>1e-3)
countContigExpressed <- sum(eic_df$'contig count'>0)
A total of 44 were found above a threshold of \(1e-3\) by qPCR. After assembly a total of 31 genes were identified.
Now we look at the overall similarity of coverage. This is rather sensitive to single genes with large differences in exression profiles between the methods.
qPCR <- eic_df$'expression count'
qPCR[qPCR<1e-3] <- 0
contig <- eic_df$'contig count'
The mean percentage difference in expression profiles is 1.6342005